# -*- coding: utf-8 -*-
# https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4403396993644448&is_show_bulletin=2&is_mix=0&max_id=139524556292235&count=20&uid=2656274875
#https://m.weibo.cn/comments/hotflow?id=4404093797907178&mid=4404093797907178&max_id_type=0
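# The hotflow endpoint returns JSON roughly shaped as sketched below (inferred from the
# field accesses later in this file; the exact key names are an assumption, not a spec):
#   {"data": {"data": [{"id": ..., "text": "<html...>", "created_at": ...,
#                       "like_count": ..., "total_number": ..., "source": ...,
#                       "floor_number": ..., "user": {"screen_name": ...}}, ...],
#             "max_id": ..., "max_id_type": ...}}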
#import os

#import pandas as pd
#import requests
#from bs4 import BeautifulSoup

#
# def fetchUrl(pid, uid, max_id):
#     # endpoint URL
#     url = "https://m.weibo.cn/comments/hotflow"
#     # request headers
#     headers = {
#         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"
#     }
#     # query parameters (uid is only needed by the desktop buildComments endpoint shown in the URL at the top of this file)
#     params = {
#         "id": pid,
#         "mid": pid,
#         "max_id": max_id,
#         "max_id_type": 0,
#     }
#
#     r = requests.get(url, headers=headers, params=params)
#
#     return r.json()
#
#
# def parseJson(jsonObj):
#     # the comment list and the paging cursor are nested under "data" in the hotflow response
#     data = jsonObj["data"]["data"]
#     max_id = jsonObj["data"]["max_id"]
#
#     commentData = []
#     for item in data:
#         # comment id
#         comment_Id = item["id"]
#         # comment text (strip the HTML markup)
#         content = BeautifulSoup(item["text"], "html.parser").text
#         # time posted
#         created_at = item["created_at"]
#         # like count
#         like_counts = item["like_counts"]
#         # number of replies
#         total_number = item["total_number"]
#
#         # commenter id, name, city
#         user = item["user"]
#         userID = user["id"]
#         userName = user["name"]
#         userCity = user["location"]
#
#         dataItem = [comment_Id, created_at, userID, userName, userCity, like_counts, total_number, content]
#         print(dataItem)
#         commentData.append(dataItem)
#
#     return commentData, max_id
#
#
# def save_data(data, path, filename):
#     if not os.path.exists(path):
#         os.makedirs(path)
#
#     dataframe = pd.DataFrame(data)
#     dataframe.to_csv(path + filename, encoding='utf_8_sig', mode='a', index=False, sep=',', header=False)
#
#
# if __name__ == "__main__":
#
#     pid = 4717939545342043  # weibo post id (fixed)
#     uid = 6512991534  # poster's user id (fixed)
#     max_id = 0
#     path = "G:/py/"  # output directory
#     filename = "comments.csv"  # output file name
#
#     csvHeader = [["评论id", "发布时间", "用户id", "用户昵称", "用户城市", "点赞数", "回复数", "评论内容"]]
#     save_data(csvHeader, path, filename)
#
#     while True:
#         html = fetchUrl(pid, uid, max_id)
#         comments, max_id = parseJson(html)
#         save_data(comments, path, filename)
#         # max_id == 0 means the last page has been reached
#         if max_id == 0:
#             break
# @time: 2021/5/11 19:00
# @Author: 韩国麦当劳
# @Environment: Python 3.7
# @file: 微博评论.py

# import json
# import csv
# import re
# import requests
# import time
#
#
# # fetch the raw response text of one request
# def get_html(url):
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
#         "Referer": "https://m.weibo.cn"
#     }
#     cookies = {
#         "cookie": "ySUB=_2A25PhhIADeRhGeBO61IT9CnEyD2IHXVsiL5IrDV6PUJbktANLRbYkW1NSkBchgauJa4UR3g6budAM0kdkvEaRfTk; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh.OcAnJyRs55-Kox512Vig5NHD95Qceh57eoBN1hepWs4DqcjZCJ8_dPLz9g4EwBtt; _T_WM=57365897740; WEIBOCN_FROM=1110006030; MLOGIN=1; XSRF-TOKEN=329971; mweibo_short_token=c12fc604a4; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D38%2526q%253D%25E5%258F%25B0%25E9%25A3%258E%25E5%2588%25A9%25E5%25A5%2587%25E9%25A9%25AC%2526t%253D0%26uicode%3D10000011%26fid%3D231522type%253D1%2526t%253D10%2526q%253D%2523%25E5%258F%25B0%25E9%25A3%258E%25E5%2588%25A9%25E5%25A5%2587%25E9%25A9%25AC%2523%26oid%3D4403396993644448"
#     }
#     response = requests.get(url, headers=headers, cookies=cookies)
#     response.encoding = response.apparent_encoding
#     time.sleep(3)   # 3-second delay to reduce the risk of being blocked by anti-scraping
#     print(response.text)
#     return response.text
#
#
# def get_string(text):
#     t = ''
#     flag = 1
#     for i in text:
#         if i == '<':
#             flag = 0
#         elif i == '>':
#             flag = 1
#         elif flag == 1:
#             t += i
#     return t
#
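# # Note: get_string keeps only the characters that sit outside '<...>' tags, i.e. it
# # strips the HTML markup by hand, doing the same job as BeautifulSoup(...).get_text()
# # in the other versions in this file.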
#
# # save one comment's text to data.csv
# def save_text_data(text_data):
#     text_data = get_string(text_data)
#     with open("data.csv", "a", encoding="utf-8", newline="") as fi:
#         writer = csv.writer(fi)
#         writer.writerow([text_data])
#
#
# # fetch the second-level (reply) comments under comment cid
# def get_second_comments(cid):
#     max_id = 0
#     max_id_type = 0
#     url = 'https://m.weibo.cn/comments/hotFlowChild?cid={}&max_id={}&max_id_type={}'
#     while True:
#         response = get_html(url.format(cid, max_id, max_id_type))
#         content = json.loads(response)
#         comments = content['data']
#         for i in comments:
#             text_data = i['text']
#             save_text_data(text_data)
#         max_id = content['max_id']
#         max_id_type = content['max_id_type']
#         if max_id == 0:  # max_id == 0 means all replies have been fetched
#             break
#
#
# # fetch the first-level comments of post mid
# def get_first_comments(mid):
#     max_id = 0
#     max_id_type = 0
#     url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}'
#     while True:
#         response = get_html(url.format(mid, mid, max_id, max_id_type))
#         print(response)
#         content = json.loads(response)
#         max_id = content['data']['max_id']
#         max_id_type = content['data']['max_id_type']
#         text_list = content['data']['data']
#         for text in text_list:
#             text_data = text['text']
#             total_number = text['total_number']
#             if int(total_number) != 0:  # the comment has replies, so fetch its second-level comments
#                 get_second_comments(text['id'])
#             save_text_data(text_data)
#         if int(max_id) == 0:    # max_id == 0 means all comments have been fetched
#             break
#
#
# if __name__ == '__main__':
#     mid = ["4404093797907178"]
#     for mid_value in mid:
#         get_first_comments(mid_value)    # crawl the first-level comments (and their replies)
import requests
import time
import os
import csv
from bs4 import BeautifulSoup

# starting URL for the hot-comments endpoint (max_id and max_id_type are supplied per request via params)
url = 'https://m.weibo.cn/comments/hotflow?id=4404093797907178&mid=4404093797907178'
headers = {
    'Cookie': 'SUB=_2A25PhhIADeRhGeBO61IT9CnEyD2IHXVsiL5IrDV6PUJbktANLRbYkW1NSkBchgauJa4UR3g6budAM0kdkvEaRfTk; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh.OcAnJyRs55-Kox512Vig5NHD95Qceh57eoBN1hepWs4DqcjZCJ8_dPLz9g4EwBtt; _T_WM=57365897740; WEIBOCN_FROM=1110006030; MLOGIN=1; M_WEIBOCN_PARAMS=oid%3D4403396993644448%26luicode%3D10000011%26lfid%3D231522type%253D1%2526t%253D10%2526q%253D%2523%25E5%258F%25B0%25E9%25A3%258E%25E5%2588%25A9%25E5%25A5%2587%25E9%25A9%25AC%2523; XSRF-TOKEN=7c791f',
    'Referer': 'https://m.weibo.cn/detail/4404093797907178',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47',
    'X-Requested-With': 'XMLHttpRequest'
}
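
# Note: the Cookie and XSRF-TOKEN above are tied to one logged-in browser session and
# expire; replace them with values from your own session, otherwise the endpoint will
# typically return empty or redirected responses.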

def get_page(max_id, id_type):
    params = {
        'max_id': max_id,
        'max_id_type': id_type
    }
    try:
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            return r.json()
    except requests.ConnectionError as e:
        print('error', e.args)
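
# Hypothetical quick check (kept commented out, not part of the crawl below): fetch the
# first page and confirm it parses before running the full crawl.
#   first = get_page(0, 0)
#   if first:
#       print(len(first['data']['data']), 'comments on the first page')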


def parse_page(jsondata):
    if jsondata:
        items = jsondata.get('data')
        item_max_id = {}
        item_max_id['max_id'] = items['max_id']
        item_max_id['max_id_type'] = items['max_id_type']
        return item_max_id
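
# Pagination note: each response carries a max_id / max_id_type cursor that must be fed
# back into the next request; when the API returns max_id == 0 there are no further
# pages (the commented-out versions above stop on exactly that condition).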

def write_csv(jsondata):
    datas = jsondata.get('data').get('data')
    for data in datas:
        created_at = data.get("created_at")
        like_count = data.get("like_count")
        source = data.get("source")
        floor_number = data.get("floor_number")
        username = data.get("user").get("screen_name")
        # strip the HTML markup from the comment body before writing it out
        comment = BeautifulSoup(data.get("text"), 'lxml').get_text()
        writer.writerow([username, created_at, like_count, floor_number, source, comment])

# write the results to a CSV file in the current working directory
path = os.getcwd() + "/weiboComments1.csv"
csvfile = open(path, 'w', encoding='utf-8', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Username', 'Time', 'Like_count', 'Floor_number', 'Source', 'Comments'])

maxpage = 50  # maximum number of comment pages to crawl
m_id = 0
id_type = 0
for page in range(0, maxpage):
    print(page)
    jsondata = get_page(m_id, id_type)
    if not jsondata:  # the request failed or was rejected, so stop crawling
        break
    write_csv(jsondata)
    results = parse_page(jsondata)
    time.sleep(1)  # throttle requests between pages
    m_id = results['max_id']
    id_type = results['max_id_type']
    if m_id == 0:  # max_id == 0 means the last page has been reached
        break

csvfile.close()
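
# Optional sanity check (hypothetical; assumes pandas is installed, as in the
# commented-out version at the top of this file): read the CSV back and print
# the first few rows.
#   import pandas as pd
#   print(pd.read_csv(path).head())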
